Goal for the project is to Predict whether a client will subscribe (yes / no) to a term deposit, this based on data from a Portuguese bank’s direct marketing campaigns conducted via phone calls.
The dataset is structured and suitable for models like Logistic Regression, LDA, QDA, KNN, Random Forest, and others.
data <- read.csv("./bank-full.csv", header = TRUE, sep = ";", stringsAsFactors = TRUE)
bank <- data |> rename(subscribed = y)
num_rows <- nrow(bank)
num_cols <- ncol(bank) - 1
data_summary <- data.frame(
Characteristic = c("Number of Rows", "Number of Columns", "Number of Predictors", "Target Variable"),
Value = c(num_rows, num_cols, num_cols, "subscribed")
)
data_summary
summary(bank)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome subscribed
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
str(bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education : Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ subscribed: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
This data-set contains no empty values at first sight.
colSums(is.na(bank))
## age job marital education default balance housing
## 0 0 0 0 0 0 0
## loan contact day month duration campaign pdays
## 0 0 0 0 0 0 0
## previous poutcome subscribed
## 0 0 0
numeric_vars <- names(bank)[sapply(bank, is.numeric)]
categorical_vars <- names(bank)[sapply(bank, function(x) is.factor(x) || is.character(x))]
cat("Numeric variables:\n")
## Numeric variables:
print(numeric_vars)
## [1] "age" "balance" "day" "duration" "campaign" "pdays" "previous"
cat("Categorical variables:\n")
## Categorical variables:
print(categorical_vars)
## [1] "job" "marital" "education" "default" "housing"
## [6] "loan" "contact" "month" "poutcome" "subscribed"
Visual confirmation for emptyness search, no data is missing in this data set
vis_miss(bank) +
labs(
title = "Visualizing Missing Data",
x = "",
y = ""
) +
theme(
plot.title = element_text(size = 8, face = "bold"),
plot.subtitle = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1)
)
plt_subscribed <- bank |>
group_by(subscribed) |>
summarise(cnt = n()) |>
mutate(perc = round(cnt / sum(cnt), 4))
plt_prop <- ggplot(plt_subscribed, aes(x = subscribed, y = perc, colour = subscribed)) +
geom_bar(aes(fill = subscribed), show.legend = FALSE, stat = "identity") +
ylab("Proportion of Subscribed")
grid.arrange(grobs = list(tableGrob(plt_subscribed), plt_prop), ncol = 1)
categorical_vars_plt <- categorical_vars[categorical_vars != "subscribed"]
plt_categorical <- lapply(seq_along(categorical_vars_plt), function(i) {
ggplot(bank, aes_string(x = categorical_vars_plt[i], fill = "subscribed")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_color_discrete() +
labs(title = paste("Subscription Rate by", categorical_vars_plt[i]),
y = "Proportion", x = NULL) +
coord_flip() +
theme(legend.position = if (i == 1) "bottom" else "none")
})
wrap_plots(plt_categorical, ncol = 3, guides = "collect") & theme(legend.position = "bottom")
plt_num <- lapply(numeric_vars, function(var) {
ggplot(bank, aes_string(x = "subscribed", y = var, fill = "subscribed")) +
geom_boxplot(alpha = 0.7) +
scale_color_discrete() +
labs(title = paste("Dist. of", var, "by Subscribed"), y = var, x = NULL)
})
wrap_plots(plt_num, ncol = 3) & theme(legend.position = "none")
for_outliers <- bank
for_outliers$subscribed <- ifelse(bank$subscribed == "yes", 1, 0)
model <- lm(subscribed ~ previous, data = for_outliers)
influencePlot(model, main = "Influence Plot: Cook's D vs Leverage", sub = "Size of circle = Cook's distance")
cooks_d <- cooks.distance(model)
N <- 5
top_influential_df <- data.frame(
index = 1:length(cooks_d),
cooks_distance = cooks_d,
previous = bank$previous,
subscribed = bank$subscribed
) |>
arrange(desc(cooks_distance)) |>
slice(1:N)
ggplot(top_influential_df, aes(x = reorder(as.factor(index), -cooks_distance), y = cooks_distance)) +
geom_col(fill = base_palette[3]) +
geom_text(aes(label = round(cooks_distance, 4)), vjust = -0.5, size = 3.5) +
labs(
title = paste("Top", N, "Influential Points (Cook's Distance)"),
x = "Observation Index",
y = "Cook's Distance"
)
Observation 29183 has extremely high leverage and Cook’s Distance (~44.3), making it a highly influential point in the model.
bank <- bank[-29183, ]
for_outliers <- bank
for_outliers$subscribed <- ifelse(bank$subscribed == "yes", 1, 0)
model <- lm(subscribed ~ previous, data = for_outliers)
influencePlot(model, main = "Influence Plot: Cook's D vs Leverage", sub = "Size of circle = Cook's distance")
for_corr <- bank
for_corr$subscribed <- ifelse(bank$subscribed == "yes", 1, 0)
vars_corr <- names(for_corr)[sapply(for_corr, is.numeric)]
corr_df <- for_corr[vars_corr]
cor_matrix <- cor(corr_df, use = "complete.obs")
subscribed_cor <- cor_matrix[, "subscribed", drop = FALSE]
subscribed_cor <- subscribed_cor[order(abs(subscribed_cor[,1]), decreasing = TRUE), , drop = FALSE]
cor_df <- data.frame(
variable = rownames(subscribed_cor),
correlation = subscribed_cor[,1]
)
ggplot(cor_df, aes(x = reorder(variable, correlation), y = correlation)) +
geom_bar(stat = "identity", fill = base_palette[3]) +
coord_flip() +
labs(title = "Correlation with Subscribed", x = "Variable", y = "Correlation")
for_pca <- bank
for_pca <- bank[sapply(data, is.numeric)]
pca <- prcomp(for_pca, scale. = TRUE)
autoplot(pca, data = bank, colour = 'subscribed', loadings = TRUE, loadings.label = TRUE) +
labs(title = "PCA")
loadings <- as.data.frame(pca$rotation)
loadings$variable <- rownames(loadings)
loadings$PC1.ABS <- abs(loadings$PC1)
loadings$PC2.ABS <- abs(loadings$PC2)
top_pc1 <- loadings[order(-loadings$PC1.ABS), c("variable", "PC1")][1:7, ]
top_pc2 <- loadings[order(-loadings$PC2.ABS), c("variable", "PC2")][1:7, ]
top_combined <- data.frame(
PC1_Variable = top_pc1$variable,
PC1_Loading = round(top_pc1$PC1, 3),
PC2_Variable = top_pc2$variable,
PC2_Loading = round(top_pc2$PC2, 3)
)
top_combined
Principal Component Analysis (PCA) on the numeric variables revealed that pdays and previous contributed most to the first principal component (PC1), capturing variability related to past campaign exposure. The second component (PC2) was primarily influenced by campaign, day, and negatively by duration, reflecting variation in campaign intensity and contact timing.
We can interpret the components as:
eigenvals <- pca$sdev^2
plot(eigenvals / sum(eigenvals), type = "l", main = "Scree Plot", ylab = "Prop. Var. Explained", xlab = "PC #", ylim = c(0, 1))
cumulative.prop <- cumsum(eigenvals / sum(eigenvals))
lines(cumulative.prop, lty = 2)
eigenvals <- pca$sdev^2
prop_var <- eigenvals / sum(eigenvals)
cum_var <- cumsum(prop_var)
pc_table <- data.frame(
PC = paste0("PC", 1:length(prop_var)),
"Variance Explained" = round(prop_var, 4),
"Cumulative Variance" = round(cum_var, 4)
)
pc_table
pca_scores <- as.data.frame(pca$x)
pca_scores$subscribed <- bank$subscribed
plot_ly(
data = pca_scores,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~subscribed,
colors = c(base_palette[3], "skyblue"),
type = "scatter3d",
mode = "markers"
)
It appears that we’re missing a significant portion of the variance by focusing only on the numeric variables. PC1 and PC2 explain the most variation among these, but even after adding PC3, we don’t observe meaningful separation between subscription outcomes. This suggests that additional structure — possibly critical for understanding or predicting subscribed — may lie in the categorical variables, which were not included in this PCA.
Let’s understand a bit how the numerals contribute to explain the subscription
for_glm <- bank
for_glm$subscribed <- ifelse(for_glm$subscribed == "yes", 1, 0)
all_num_additive <- glm(subscribed ~ duration + pdays + previous + campaign + balance + age + day, data = for_glm, family = binomial)
summary(all_num_additive)
##
## Call:
## glm(formula = subscribed ~ duration + pdays + previous + campaign +
## balance + age + day, family = binomial, data = for_glm)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.472e+00 7.700e-02 -45.085 < 2e-16 ***
## duration 3.643e-03 5.648e-05 64.500 < 2e-16 ***
## pdays 1.964e-03 1.552e-04 12.654 < 2e-16 ***
## previous 1.006e-01 7.340e-03 13.709 < 2e-16 ***
## campaign -1.292e-01 9.609e-03 -13.445 < 2e-16 ***
## balance 3.704e-05 4.294e-06 8.625 < 2e-16 ***
## age 7.910e-03 1.473e-03 5.370 7.88e-08 ***
## day -1.661e-03 2.014e-03 -0.825 0.409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32631 on 45209 degrees of freedom
## Residual deviance: 26464 on 45202 degrees of freedom
## AIC: 26480
##
## Number of Fisher Scoring iterations: 6
tidy(all_num_additive, conf.int = TRUE) |>
filter(term != "(Intercept)") |>
ggplot(aes(x = reorder(term, estimate), y = estimate)) +
geom_point() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.1) +
coord_flip() +
labs(title = "Logistic Regression Coefficients",
y = "Estimate (log-odds)", x = "Variable")
Modeling with all categoricals might tell some story
bank_cat <- bank
cat_model_vars <- setdiff(categorical_vars, "subscribed")
model_glm_cat <- as.formula(paste("subscribed ~", paste(cat_model_vars, collapse = " + ")))
glm_cats <- glm(model_glm_cat, data = bank_cat, family = binomial)
summary(glm_cats)
##
## Call:
## glm(formula = model_glm_cat, family = binomial, data = bank_cat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.24333 0.10655 -11.669 < 2e-16 ***
## jobblue-collar -0.12464 0.06490 -1.921 0.054785 .
## jobentrepreneur -0.19100 0.11110 -1.719 0.085581 .
## jobhousemaid -0.28253 0.11968 -2.361 0.018238 *
## jobmanagement -0.04697 0.06561 -0.716 0.474084
## jobretired 0.46523 0.07788 5.974 2.32e-09 ***
## jobself-employed -0.09072 0.09911 -0.915 0.360040
## jobservices -0.08617 0.07451 -1.156 0.247478
## jobstudent 0.33069 0.09770 3.385 0.000713 ***
## jobtechnician -0.06466 0.06187 -1.045 0.295967
## jobunemployed 0.13108 0.09768 1.342 0.179626
## jobunknown -0.19401 0.20780 -0.934 0.350479
## maritalmarried -0.20629 0.05158 -4.000 6.35e-05 ***
## maritalsingle 0.08241 0.05554 1.484 0.137862
## educationsecondary 0.15081 0.05652 2.668 0.007627 **
## educationtertiary 0.31779 0.06570 4.837 1.32e-06 ***
## educationunknown 0.20162 0.09242 2.182 0.029137 *
## defaultyes -0.15693 0.14688 -1.068 0.285318
## housingyes -0.54527 0.03810 -14.313 < 2e-16 ***
## loanyes -0.40804 0.05303 -7.694 1.43e-14 ***
## contacttelephone -0.28127 0.06399 -4.395 1.11e-05 ***
## contactunknown -1.34752 0.06337 -21.263 < 2e-16 ***
## monthaug -0.97784 0.06846 -14.284 < 2e-16 ***
## monthdec 0.57038 0.16198 3.521 0.000429 ***
## monthfeb -0.44644 0.07501 -5.952 2.65e-09 ***
## monthjan -1.08345 0.10618 -10.203 < 2e-16 ***
## monthjul -0.79694 0.06766 -11.779 < 2e-16 ***
## monthjun 0.10830 0.08091 1.338 0.180741
## monthmar 1.06567 0.11027 9.664 < 2e-16 ***
## monthmay -0.50694 0.06341 -7.995 1.30e-15 ***
## monthnov -0.83485 0.07443 -11.216 < 2e-16 ***
## monthoct 0.68172 0.09776 6.973 3.10e-12 ***
## monthsep 0.65425 0.10739 6.092 1.11e-09 ***
## poutcomeother 0.25429 0.07960 3.194 0.001401 **
## poutcomesuccess 2.26565 0.07345 30.848 < 2e-16 ***
## poutcomeunknown 0.03496 0.05155 0.678 0.497693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32631 on 45209 degrees of freedom
## Residual deviance: 27296 on 45174 degrees of freedom
## AIC: 27368
##
## Number of Fisher Scoring iterations: 6
X_cat <- model.matrix(model_glm_cat, data = bank_cat)
reference_lvls <- data.frame(
Variable = cat_model_vars,
Reference = sapply(bank_cat[cat_model_vars], function(x) levels(x)[1])
) |> tibble::as_tibble()
reference_lvls
odds_ratios <- data.frame(
Variable = names(coef(glm_cats)),
Odds_Ratio = exp(coef(glm_cats))
) |> dplyr::mutate(Effect = paste0(round((Odds_Ratio - 1) * 100, 1), "%")) |>
dplyr::mutate(Odds_Ratio = round(Odds_Ratio, 3)) |>
dplyr::arrange(desc(Odds_Ratio)) |>
tibble::as_tibble()
odds_ratios
We can see from the categorical only variables that:
| Variable | Visual Pattern? | Clear % Difference? | Keep? |
|---|---|---|---|
| contact | yes | yes (cellular > unknown) | yes |
| loan | yes | yes (loan = less likely) | yes |
| housing | yes | yes (housing = likely) | yes |
| default | maybe | some | maybe |
| education | yes | yes (tertiary increases) | yes |
| poutcome | strong | yes (success = very high) | yes |
| marital | maybe | some separation | maybe |
| job | mixed | a few clear signals | maybe (group rare levels) |
Carefully look at poutcome as we do not know what drives from previous mkt approach, and if the customer is showing an affinity to long term deposit, maybe is increasing the current deposit. Default sounds like a good story, I tried swapping ref with not much difference.
We are having very conflicting results based on the multiple explorations on numerical, that is telling us, that we need categorical variables to play a role in the explainability.
We think that cyclical encoding for month as we see some patterns on specific months could help the model to explain better as seasonality seems to have some effect.
| Variable | Type | Reason for Inclusion |
|---|---|---|
| duration | Numerical | Strongest univariate predictor; higher durations consistently increase subscription odds |
| pdays | Numerical | Captures time since last contact; reflects engagement recency |
| previous | Numerical | Reflects past campaign success; useful but may be redundant with pdays/campaign |
| balance | Numerical | Indicates client financial status; moderate predictive signal |
| campaign | Numerical | Current campaign intensity; negative association suggests fatigue with repeated contact |
| month_sin | Numerical | Cyclical encoding of month (seasonality); preserves circular month structure |
| month_cos | Numerical | Complement to month_sin; together capture monthly cyclic patterns |
| contact | Categorical | Clear visual and statistical difference; contact method affects likelihood to subscribe |
| loan | Categorical | Customers with loans are less likely to subscribe; simple and interpretable |
| education | Categorical | Higher education levels (tertiary) correlate with higher subscription odds |
| marital | Categorical | Some variation observed; potentially useful with clear reference level |
| job | Categorical | Certain job roles (retired, student) show increased subscription; use with level grouping |
Will do the cyclical encoding for month and dummy variables (as they are factors GLM will dummy them)
candidate_data <- bank
candidate_data$month_num <- as.numeric(factor(candidate_data$month, levels = c(
"jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec"
)))
candidate_data$month_sin <- sin(2 * pi * candidate_data$month_num / 12)
candidate_data$month_cos <- cos(2 * pi * candidate_data$month_num / 12)
candidate_data <- candidate_data |> dplyr::select(-month)
head(candidate_data)
split_rate <- 0.7
split <- sample(1:nrow(candidate_data), split_rate * nrow(candidate_data))
train_data <- candidate_data[split, ]
test_data <- candidate_data[-split, ]
#num_feat <- c("duration", "poutcome", "pdays", "balance", "default", "housing", "campaign", "month_sin", "month_cos")
num_feat <- c("duration", "balance", "campaign", "month_sin", "month_cos")
cat_feat <- c("contact", "loan", "education", "marital", "job", "day", "previous", "poutcome", "housing")
features <- c(num_feat, cat_feat)
candidate_model <- as.formula(paste("subscribed ~", paste(features, collapse = " + ")))
threshold <- 0.25
candidate_fit <- glm(candidate_model, data = train_data, family = binomial)
glm_pred <- predict(candidate_fit, newdata = test_data, type = "response")
model_levels <- levels(candidate_data$subscribed)
pred_class <- factor(ifelse(glm_pred > threshold, "yes", "no"), levels = model_levels)
glm_actual <- factor(test_data$subscribed, levels = model_levels)
confusionMatrix(pred_class, glm_actual, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11223 745
## yes 725 871
##
## Accuracy : 0.8916
## 95% CI : (0.8863, 0.8968)
## No Information Rate : 0.8809
## P-Value [Acc > NIR] : 4.708e-05
##
## Kappa : 0.4809
##
## Mcnemar's Test P-Value : 0.6202
##
## Sensitivity : 0.53899
## Specificity : 0.93932
## Pos Pred Value : 0.54574
## Neg Pred Value : 0.93775
## Prevalence : 0.11914
## Detection Rate : 0.06421
## Detection Prevalence : 0.11766
## Balanced Accuracy : 0.73915
##
## 'Positive' Class : yes
##
thresholds <- seq(0.1, 0.9, by = 0.01)
metrics_df <- purrr::map_dfr(thresholds, function(thresh) {
pred_class <- factor(ifelse(glm_pred > thresh, "yes", "no"), levels = c("no", "yes"))
tibble(
threshold = thresh,
precision = yardstick::precision_vec(truth = glm_actual, estimate = pred_class, event_level = "second"),
recall = yardstick::recall_vec(truth = glm_actual, estimate = pred_class, event_level = "second"),
f1 = yardstick::f_meas_vec(truth = glm_actual, estimate = pred_class, event_level = "second")
)
})
ggplot(metrics_df, aes(x = threshold)) +
geom_line(aes(y = f1), color = base_palette[1]) +
geom_line(aes(y = precision), color = base_palette[2]) +
geom_line(aes(y = recall), color = base_palette[3]) +
labs(title = "Threshold Tuning", y = "Metric", x = "Threshold")
test_data_aug <- test_data %>%
dplyr::mutate(
pred_prob = glm_pred,
pred_class = factor(ifelse(glm_pred > threshold, "yes", "no"), levels = c("no", "yes")),
subscribed = factor(subscribed, levels = c("no", "yes")),
result = case_when(
subscribed == "yes" & pred_class == "yes" ~ "TP",
subscribed == "no" & pred_class == "yes" ~ "FP",
subscribed == "yes" & pred_class == "no" ~ "FN",
subscribed == "no" & pred_class == "no" ~ "TN"
)
)
ggplot(test_data_aug, aes(x = duration, y = balance, color = result)) +
geom_point(alpha = 0.4) +
labs(title = "False Positives vs. True Positives",
subtitle = paste("Threshold:", threshold),
color = "Prediction Outcome")
test_data$subscribed <- factor(test_data$subscribed, levels = c("no", "yes"))
pr_df <- tibble(
subscribed = test_data$subscribed,
.pred_yes = glm_pred
)
pr_curve(pr_df, truth = subscribed, .pred_yes) %>%
autoplot() +
labs(
title = "Precision-Recall Curve",
subtitle = "Probability thresholds for predicting 'yes'"
)
## Registered S3 method overwritten by 'parsnip':
## method from
## autoplot.glmnet ggfortify
bank_loess <- bank
bank_loess$subscribed <- ifelse(bank_loess$subscribed == "yes", 1, 0)
plt_numeric_vars <- setdiff(numeric_vars, c())
plots <- lapply(plt_numeric_vars, function(var) {
ggplot(bank_loess, aes_string(x = var, y = "subscribed", colour = var)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, size = 1, span = 1.5) +
ylim(-.2, 1.2) +
labs(title = paste("Subscription vs", var), y = "Subscription Rate", x = var)
})
grid.arrange(grobs = plots, ncol = 3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
| Variable | Shape | Interpretation | Suggestion |
|---|---|---|---|
age |
U-shape → increasing | Older clients tend to subscribe more; age shows a non-linear effect | Include age^2 |
balance |
Inverted U-shape | Clients with mid-range balances are more likely to subscribe | Use balance + balance^2 |
duration |
Logistic-like rise then plateau | Longer calls are associated with higher subscription rates, with signs of saturation | Consider log(duration + 1) or natural spline |
campaign |
Shallow U-shape | Additional contacts may reduce effectiveness after a point | Keep as linear or bin into tiers |
pdays |
Flat with mild positive curvature | Values like -1 dominate; higher values show a weak increasing trend | Cap at 300 or bin; optionally include pdays == -1
indicator |
previous |
Curved rise then decline | Moderate prior contact increases success; too many may reduce effectiveness | Use previous + previous^2 or
log(previous + 1) |
#' By domain knowledge we explore couple interactions.
#' - duration × poutcome: Duration and previous outcome can drive success
#' - duration × contact: Communication channel can affect duration of calls
#' - job × education: Job and education can have strong relationships
#' - housing × loan: Debt-free can drive decisions regarding financial risks
inter_data <- candidate_data
inter_data$subscribed <- ifelse(inter_data$subscribed == "yes", 1, 0)
iner_job_data <- inter_data |>
group_by(job, education) |>
summarise(subscribe_rate = mean(subscribed), .groups = "drop")
housing_loan_data <- inter_data %>%
group_by(housing, loan) %>%
summarise(rate = mean(subscribed), .groups = "drop")
plt_outcome <- ggplot(inter_data, aes(y = subscribed, x = duration, colour = poutcome)) +
geom_point() +
labs(
title = "Subscription Rate by Duration × Outcome",
) +
geom_smooth(method = "loess", size = 1, span = .5) +
ylim(-.2, 1)
plt_dur_ct <- ggplot(inter_data, aes(y = subscribed, x = duration, colour = contact)) +
geom_point() +
labs(
title = "Subscription Rate by Duration × Contact",
) +
geom_smooth(method = "loess", size = 1, span = .5) +
ylim(-.2, 1)
plt_job <- ggplot(iner_job_data, aes(x = education, y = job, fill = subscribe_rate)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = base_palette[3], name = "Subscription Rate") +
labs(
title = "Subscription Rate by Job × Education",
x = "Education Level", y = "Job"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plt_house_loan <- ggplot(housing_loan_data, aes(x = housing, y = rate, fill = loan)) +
geom_col(position = "dodge") +
labs(
title = "Subscription Rate by Housing × Loan",
x = "Housing Loan", y = "Subscription Rate"
)
(plt_outcome + plt_dur_ct) / (plt_job + plt_house_loan)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#' Local Variables
#' - Setting threshold for all models - might need adjustment for each; but the goal is to compare in a fair ground
#' - Train Control for all models, same cross validation. Can be set individually for each model
#' - Set Reference Levels
threshold <- 0.20
model_levels <- levels(candidate_data$subscribed)
fitControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 1,
classProbs = TRUE,
summaryFunction = mnLogLoss
)
drop_num <- c("pdays")
drop_cat <- c("month", "subscribed")
no_transform <- c("month_sin", "month_cos", "day")
numeric_clean <- setdiff(numeric_vars, drop_num)
categorical_clean <- setdiff(categorical_vars, drop_cat)
existing_vars <- names(candidate_data)
direct_numeric <- intersect(setdiff(no_transform, drop_num), existing_vars)
# For GLM/LDA models
num_poly <- setdiff(numeric_clean, no_transform)
poly_terms <- paste0("poly(", num_poly, ", 2, raw = TRUE)")
interaction_terms <- c(
"poly(duration, 2, raw = TRUE):poutcome",
"poly(duration, 2, raw = TRUE):contact",
"job:education",
"housing:loan"
)
formula_terms_full <- c(poly_terms, direct_numeric, categorical_clean, interaction_terms)
candidate_formula_string <- paste("subscribed ~", paste(formula_terms_full, collapse = " + "))
candidate_model <- as.formula(candidate_formula_string)
# Random Forest version
formula_terms_rf <- intersect(c(numeric_clean, direct_numeric, categorical_clean), existing_vars)
rf_formula_string <- paste("subscribed ~", paste(formula_terms_rf, collapse = " + "))
rf_candidate_model <- as.formula(rf_formula_string)
candidate_model
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) +
## poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) +
## poly(previous, 2, raw = TRUE) + month_sin + month_cos + day +
## job + marital + education + default + housing + loan + contact +
## poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration,
## 2, raw = TRUE):contact + job:education + housing:loan
rf_candidate_model
## subscribed ~ age + balance + day + duration + campaign + previous +
## month_sin + month_cos + job + marital + education + default +
## housing + loan + contact + poutcome
step_model <- stepAIC(
glm(candidate_model, data = train_data, family = binomial),
scope = list(lower = ~ month_cos, upper = candidate_model),
direction = "both"
)
## Start: AIC=14893.02
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) +
## poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) +
## poly(previous, 2, raw = TRUE) + month_sin + month_cos + day +
## job + marital + education + default + housing + loan + contact +
## poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration,
## 2, raw = TRUE):contact + job:education + housing:loan
##
## Df Deviance AIC
## - job:education 33 14792 14890
## - day 1 14729 14891
## - default 1 14729 14891
## <none> 14729 14893
## - poly(previous, 2, raw = TRUE) 2 14735 14895
## - housing:loan 1 14743 14905
## - marital 2 14752 14912
## - poly(balance, 2, raw = TRUE) 2 14756 14916
## - month_sin 1 14766 14928
## - poly(duration, 2, raw = TRUE):poutcome 6 14788 14940
## - poly(duration, 2, raw = TRUE):contact 4 14792 14948
## - poly(campaign, 2, raw = TRUE) 2 14797 14957
## - poly(age, 2, raw = TRUE) 2 14815 14975
##
## Step: AIC=14889.51
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) +
## poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) +
## poly(previous, 2, raw = TRUE) + month_sin + month_cos + day +
## job + marital + education + default + housing + loan + contact +
## poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration,
## 2, raw = TRUE):contact + housing:loan
##
## Df Deviance AIC
## - day 1 14792 14888
## - default 1 14792 14888
## <none> 14792 14890
## - poly(previous, 2, raw = TRUE) 2 14798 14892
## + job:education 33 14729 14893
## - housing:loan 1 14806 14902
## - marital 2 14814 14908
## - education 3 14816 14908
## - poly(balance, 2, raw = TRUE) 2 14819 14913
## - month_sin 1 14827 14923
## - job 11 14851 14927
## - poly(duration, 2, raw = TRUE):poutcome 6 14851 14937
## - poly(duration, 2, raw = TRUE):contact 4 14855 14945
## - poly(campaign, 2, raw = TRUE) 2 14862 14956
## - poly(age, 2, raw = TRUE) 2 14881 14975
##
## Step: AIC=14887.51
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) +
## poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) +
## poly(previous, 2, raw = TRUE) + month_sin + month_cos + job +
## marital + education + default + housing + loan + contact +
## poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration,
## 2, raw = TRUE):contact + housing:loan
##
## Df Deviance AIC
## - default 1 14792 14886
## <none> 14792 14888
## + day 1 14792 14890
## - poly(previous, 2, raw = TRUE) 2 14798 14890
## + job:education 33 14729 14891
## - housing:loan 1 14806 14900
## - marital 2 14814 14906
## - education 3 14816 14906
## - poly(balance, 2, raw = TRUE) 2 14819 14911
## - month_sin 1 14828 14922
## - job 11 14851 14925
## - poly(duration, 2, raw = TRUE):poutcome 6 14851 14935
## - poly(duration, 2, raw = TRUE):contact 4 14855 14943
## - poly(campaign, 2, raw = TRUE) 2 14863 14955
## - poly(age, 2, raw = TRUE) 2 14881 14973
##
## Step: AIC=14885.56
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) +
## poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) +
## poly(previous, 2, raw = TRUE) + month_sin + month_cos + job +
## marital + education + housing + loan + contact + poutcome +
## poly(duration, 2, raw = TRUE):poutcome + poly(duration, 2,
## raw = TRUE):contact + housing:loan
##
## Df Deviance AIC
## <none> 14792 14886
## + default 1 14792 14888
## + day 1 14792 14888
## - poly(previous, 2, raw = TRUE) 2 14798 14888
## + job:education 33 14729 14889
## - housing:loan 1 14806 14898
## - marital 2 14814 14904
## - education 3 14816 14904
## - poly(balance, 2, raw = TRUE) 2 14819 14909
## - month_sin 1 14828 14920
## - job 11 14851 14923
## - poly(duration, 2, raw = TRUE):poutcome 6 14851 14933
## - poly(duration, 2, raw = TRUE):contact 4 14855 14941
## - poly(campaign, 2, raw = TRUE) 2 14863 14953
## - poly(age, 2, raw = TRUE) 2 14881 14971
selected_formula <- formula(step_model)
summary(step_model)
##
## Call:
## glm(formula = subscribed ~ poly(age, 2, raw = TRUE) + poly(balance,
## 2, raw = TRUE) + poly(duration, 2, raw = TRUE) + poly(campaign,
## 2, raw = TRUE) + poly(previous, 2, raw = TRUE) + month_sin +
## month_cos + job + marital + education + housing + loan +
## contact + poutcome + poly(duration, 2, raw = TRUE):poutcome +
## poly(duration, 2, raw = TRUE):contact + housing:loan, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.034e+00 3.394e-01 -3.048
## poly(age, 2, raw = TRUE)1 -1.080e-01 1.214e-02 -8.894
## poly(age, 2, raw = TRUE)2 1.243e-03 1.325e-04 9.379
## poly(balance, 2, raw = TRUE)1 6.830e-05 1.379e-05 4.952
## poly(balance, 2, raw = TRUE)2 -2.803e-09 7.036e-10 -3.983
## poly(duration, 2, raw = TRUE)1 8.312e-03 5.873e-04 14.152
## poly(duration, 2, raw = TRUE)2 -3.905e-06 4.389e-07 -8.895
## poly(campaign, 2, raw = TRUE)1 -1.283e-01 1.556e-02 -8.247
## poly(campaign, 2, raw = TRUE)2 3.186e-03 5.961e-04 5.344
## poly(previous, 2, raw = TRUE)1 6.353e-02 2.650e-02 2.397
## poly(previous, 2, raw = TRUE)2 -2.815e-03 1.404e-03 -2.005
## month_sin 2.119e-01 3.529e-02 6.004
## month_cos -1.450e-02 3.477e-02 -0.417
## jobblue-collar -4.093e-01 8.696e-02 -4.706
## jobentrepreneur -5.109e-01 1.503e-01 -3.399
## jobhousemaid -6.370e-01 1.640e-01 -3.885
## jobmanagement -2.297e-01 8.890e-02 -2.584
## jobretired -2.592e-01 1.297e-01 -1.999
## jobself-employed -4.024e-01 1.332e-01 -3.022
## jobservices -3.923e-01 1.011e-01 -3.879
## jobstudent 3.006e-01 1.336e-01 2.250
## jobtechnician -2.052e-01 8.233e-02 -2.493
## jobunemployed -2.663e-01 1.339e-01 -1.989
## jobunknown -6.657e-01 3.157e-01 -2.108
## maritalmarried -2.158e-01 6.981e-02 -3.092
## maritalsingle 1.394e-02 8.033e-02 0.174
## educationsecondary 1.729e-01 7.730e-02 2.237
## educationtertiary 4.218e-01 9.068e-02 4.652
## educationunknown 2.393e-01 1.234e-01 1.939
## housingyes -8.946e-01 5.265e-02 -16.991
## loanyes -7.757e-01 1.011e-01 -7.675
## contacttelephone -3.302e-04 1.794e-01 -0.002
## contactunknown -2.250e+00 1.995e-01 -11.275
## poutcomeother 4.759e-01 2.478e-01 1.921
## poutcomesuccess 2.599e+00 2.324e-01 11.185
## poutcomeunknown -4.610e-02 1.708e-01 -0.270
## poly(duration, 2, raw = TRUE)1:poutcomeother -1.081e-03 9.814e-04 -1.102
## poly(duration, 2, raw = TRUE)2:poutcomeother 5.945e-07 7.281e-07 0.817
## poly(duration, 2, raw = TRUE)1:poutcomesuccess -1.401e-03 1.025e-03 -1.368
## poly(duration, 2, raw = TRUE)2:poutcomesuccess 5.163e-07 7.987e-07 0.646
## poly(duration, 2, raw = TRUE)1:poutcomeunknown -1.309e-03 6.204e-04 -2.110
## poly(duration, 2, raw = TRUE)2:poutcomeunknown 1.714e-06 4.575e-07 3.746
## poly(duration, 2, raw = TRUE)1:contacttelephone -4.414e-04 6.154e-04 -0.717
## poly(duration, 2, raw = TRUE)2:contacttelephone 8.651e-08 3.705e-07 0.233
## poly(duration, 2, raw = TRUE)1:contactunknown 2.163e-03 4.850e-04 4.461
## poly(duration, 2, raw = TRUE)2:contactunknown -4.756e-07 2.611e-07 -1.822
## housingyes:loanyes 5.211e-01 1.389e-01 3.751
## Pr(>|z|)
## (Intercept) 0.002306 **
## poly(age, 2, raw = TRUE)1 < 2e-16 ***
## poly(age, 2, raw = TRUE)2 < 2e-16 ***
## poly(balance, 2, raw = TRUE)1 7.34e-07 ***
## poly(balance, 2, raw = TRUE)2 6.80e-05 ***
## poly(duration, 2, raw = TRUE)1 < 2e-16 ***
## poly(duration, 2, raw = TRUE)2 < 2e-16 ***
## poly(campaign, 2, raw = TRUE)1 < 2e-16 ***
## poly(campaign, 2, raw = TRUE)2 9.08e-08 ***
## poly(previous, 2, raw = TRUE)1 0.016533 *
## poly(previous, 2, raw = TRUE)2 0.044912 *
## month_sin 1.93e-09 ***
## month_cos 0.676588
## jobblue-collar 2.53e-06 ***
## jobentrepreneur 0.000675 ***
## jobhousemaid 0.000102 ***
## jobmanagement 0.009768 **
## jobretired 0.045641 *
## jobself-employed 0.002512 **
## jobservices 0.000105 ***
## jobstudent 0.024419 *
## jobtechnician 0.012681 *
## jobunemployed 0.046696 *
## jobunknown 0.034998 *
## maritalmarried 0.001989 **
## maritalsingle 0.862208
## educationsecondary 0.025275 *
## educationtertiary 3.29e-06 ***
## educationunknown 0.052489 .
## housingyes < 2e-16 ***
## loanyes 1.65e-14 ***
## contacttelephone 0.998532
## contactunknown < 2e-16 ***
## poutcomeother 0.054782 .
## poutcomesuccess < 2e-16 ***
## poutcomeunknown 0.787259
## poly(duration, 2, raw = TRUE)1:poutcomeother 0.270473
## poly(duration, 2, raw = TRUE)2:poutcomeother 0.414213
## poly(duration, 2, raw = TRUE)1:poutcomesuccess 0.171434
## poly(duration, 2, raw = TRUE)2:poutcomesuccess 0.517963
## poly(duration, 2, raw = TRUE)1:poutcomeunknown 0.034882 *
## poly(duration, 2, raw = TRUE)2:poutcomeunknown 0.000180 ***
## poly(duration, 2, raw = TRUE)1:contacttelephone 0.473212
## poly(duration, 2, raw = TRUE)2:contacttelephone 0.815396
## poly(duration, 2, raw = TRUE)1:contactunknown 8.17e-06 ***
## poly(duration, 2, raw = TRUE)2:contactunknown 0.068493 .
## housingyes:loanyes 0.000176 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22723 on 31645 degrees of freedom
## Residual deviance: 14792 on 31599 degrees of freedom
## AIC: 14886
##
## Number of Fisher Scoring iterations: 7
#' Simple Functions to avoid repeating a lot
#' Compute RMSE, LOGLOSS, and a Readeable Table for metrics
threshold <- 0.25
actual_preds <- factor(test_data$subscribed, levels = model_levels)
custom.rmse <- function(predicted, actual) {
sqrt(mean((predicted - ifelse(actual == "yes", 1, 0))^2))
}
custom.logloss <- function(probs, actual, eps = 1e-15) {
probs <- pmin(pmax(probs, eps), 1 - eps)
actual_binary <- ifelse(actual == "yes", 1, 0)
-mean(actual_binary * log(probs) + (1 - actual_binary) * log(1 - probs))
}
custom.results <- function(model_fit) {
probs <- predict(model_fit, newdata = test_data, type = "prob")[, "yes"]
classification <- factor(ifelse(probs > threshold, "yes", "no"), levels = model_levels)
cm <- confusionMatrix(data = classification, reference = actual_preds, positive = "yes")
list(
Probabilities = probs,
#Classification = classification,
ConfusionMatrix = cm
)
}
custom.metrics <- function(cm, fit, roc, probs) {
sensitivity <- cm$byClass["Sensitivity"]
ppv <- cm$byClass["Pos Pred Value"]
f1_score <- if ((sensitivity + ppv) > 0) {
2 * ((sensitivity * ppv) / (sensitivity + ppv))
} else {
NA
}
log_loss <- custom.logloss(probs, actual_preds)
rmse <- custom.rmse(probs, actual_preds)
aic <- NA
if (!is.null(fit$finalModel) && "aic" %in% names(fit$finalModel)) {
aic <- fit$finalModel$aic
} else if (inherits(fit$finalModel, "glm")) {
aic <- AIC(fit$finalModel)
}
data.frame(
RMSE = rmse,
AIC = aic,
Accuracy = cm$overall["Accuracy"],
Sensitivity = sensitivity,
Specificity = cm$byClass["Specificity"],
PPV = ppv,
NPV = cm$byClass["Neg Pred Value"],
Prevalence = cm$byClass["Prevalence"],
AUROC = auc(roc),
LogLoss = log_loss,
F1 = f1_score
)
}
set.seed(42)
glm_fit <- train(
candidate_model,
data = train_data,
method = "glm",
family = "binomial",
trControl = fitControl,
metric = "logLoss"
)
glm_fit_selected <- train(
selected_formula,
data = train_data,
method = "glm",
family = "binomial",
trControl = fitControl,
metric = "logLoss"
)
glm_results <- custom.results(glm_fit)
glm_results_selected <- custom.results(glm_fit_selected)
set.seed(42)
lda_fit <- train(
candidate_model,
data = train_data,
method = "lda",
trControl = fitControl,
metric = "logLoss"
)
lda_fit_selected <- train(
selected_formula,
data = train_data,
method = "lda",
trControl = fitControl,
metric = "logLoss"
)
lda_results <- custom.results(lda_fit)
lda_results_selected <- custom.results(lda_fit_selected)
knn_fit <- train(
rf_candidate_model,
data = train_data,
method = "knn",
preProcess = c("center", "scale"),
trControl = fitControl,
tuneLength = 5
)
knn_results <- custom.results(knn_fit)
set.seed(42)
rf_fit <- train(
rf_candidate_model,
data = train_data,
method = "ranger",
trControl = fitControl,
metric = "logLoss",
tuneLength = 5
)
## Growing trees.. Progress: 77%. Estimated remaining time: 9 seconds.
## Growing trees.. Progress: 68%. Estimated remaining time: 14 seconds.
## Growing trees.. Progress: 90%. Estimated remaining time: 4 seconds.
rf_results <- custom.results(rf_fit)
roc_lda <- roc(actual_preds, lda_results$Probabilities, levels = model_levels, direction = "<")
roc_glm <- roc(actual_preds, glm_results$Probabilities, levels = model_levels, direction = "<")
roc_rf <- roc(actual_preds, rf_results$Probabilities, levels = model_levels, direction = "<")
roc_knn <- roc(actual_preds, knn_results$Probabilities, levels = model_levels, direction = "<")
#' Hybrid Selected ROC
plot(roc_lda, col = base_palette[1], lwd = 2, main = "ROC Curve: LDA vs Logistic vs Random Forest")
lines(roc_glm, col = base_palette[2], lwd = 2)
lines(roc_rf, col = base_palette[3], lwd = 2)
lines(roc_knn, col = base_palette[4], lwd = 2)
legend("bottomright",
legend = c(
paste("LDA (AUC =", round(auc(roc_lda), 3), ")"),
paste("Logistic (AUC =", round(auc(roc_glm), 3), ")"),
paste("Random Forest (AUC =", round(auc(roc_rf), 3), ")"),
paste("KNN (AUC =", round(auc(roc_knn), 3), ")")
),
col = base_palette[1:4],
lwd = 4
)
#' Feature Selected ROC
roc_lda_selected <- roc(actual_preds, lda_results_selected$Probabilities, levels = model_levels, direction = "<")
roc_glm_selected <- roc(actual_preds, glm_results_selected$Probabilities, levels = model_levels, direction = "<")
#' Feature Selected ROC
plot(roc_lda_selected, col = base_palette[7], lwd = 2, main = "ROC Curve: LDA vs Logistic - (Feature Selected)")
lines(roc_glm_selected, col = base_palette[8], lwd = 2)
legend("bottomright",
legend = c(
paste("LDA (AUC =", round(auc(roc_lda_selected), 3), ")"),
paste("Logistic (AUC =", round(auc(roc_glm_selected), 3), ")")
),
col = base_palette[7:8],
lwd = 4
)
results_table <- rbind(
LDA = custom.metrics(lda_results$ConfusionMatrix, lda_fit, roc_lda, lda_results$Probabilities),
LDA.FeatureSelected = custom.metrics(lda_results_selected$ConfusionMatrix, lda_fit_selected, roc_lda_selected, lda_results_selected$Probabilities),
Logistic = custom.metrics(glm_results$ConfusionMatrix, glm_fit, roc_glm, glm_results$Probabilities),
Logistic.FeatureSelected = custom.metrics(glm_results_selected$ConfusionMatrix, glm_fit_selected, roc_glm_selected, glm_results_selected$Probabilities),
RandomForest = custom.metrics(rf_results$ConfusionMatrix, rf_fit, roc_rf, rf_results$Probabilities),
kNN = custom.metrics(knn_results$ConfusionMatrix, knn_fit, roc_knn, knn_results$Probabilities)
)
round(data.frame(t(results_table)), 4)